In [1]:
# Import the core libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all subsequent matplotlib figures;
# color_codes=True remaps matplotlib's one-letter color codes to the seaborn palette.
sns.set(color_codes=True)
%matplotlib inline 
In [2]:
# Load the plant's generation and weather-sensor datasets.
# Forward slashes are used on purpose: in a regular string literal '\P' is an
# invalid escape sequence (a warning today, an error in future Python versions),
# and a backslash-separated path only resolves on Windows. 'dados/...' works on
# every platform, Windows included.
geracao = pd.read_csv('dados/Plant_1_Generation_Data.csv')
clima = pd.read_csv('dados/Plant_1_Weather_Sensor_Data.csv')
In [3]:
# Overview of both datasets.
# A notebook cell only renders its last bare expression, so a bare
# `geracao.head()` followed by `clima.head()` silently discards the generation
# preview. Both frames are displayed explicitly instead (`display` is provided
# by the IPython/Jupyter kernel).
display(geracao.head())
display(clima.head())
Out[3]:
DATE_TIME PLANT_ID SOURCE_KEY AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION
0 2020-05-15 00:00:00 4135001 HmiyD2TTLFNqkNe 25.184316 22.857507 0.0
1 2020-05-15 00:15:00 4135001 HmiyD2TTLFNqkNe 25.084589 22.761668 0.0
2 2020-05-15 00:30:00 4135001 HmiyD2TTLFNqkNe 24.935753 22.592306 0.0
3 2020-05-15 00:45:00 4135001 HmiyD2TTLFNqkNe 24.846130 22.360852 0.0
4 2020-05-15 01:00:00 4135001 HmiyD2TTLFNqkNe 24.621525 22.165423 0.0
In [4]:
# Summary statistics of the variables in both datasets.
# Each summary is displayed explicitly: a cell only renders its last bare
# expression, so the generation summary would otherwise be computed and dropped.
display(geracao.describe())
display(clima.describe())
Out[4]:
PLANT_ID AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION
count 3182.0 3182.000000 3182.000000 3182.000000
mean 4135001.0 25.531606 31.091015 0.228313
std 0.0 3.354856 12.261222 0.300836
min 4135001.0 20.398505 18.140415 0.000000
25% 4135001.0 22.705182 21.090553 0.000000
50% 4135001.0 24.613814 24.618060 0.024653
75% 4135001.0 27.920532 41.307840 0.449588
max 4135001.0 35.252486 65.545714 1.221652
In [22]:
# Count the missing values in every column of each dataset and print both reports.
a, b = geracao.isna().sum(), clima.isna().sum()
print(f'Quantidade de elemesntos nulo da geração:\n{a}')
print(f'Quantidade de elemesntos nulo ddo clima:\n{b}')
Quantidade de elemesntos nulo da geração:
DATE_TIME      0
PLANT_ID       0
SOURCE_KEY     0
DC_POWER       0
AC_POWER       0
DAILY_YIELD    0
TOTAL_YIELD    0
dtype: int64
Quantidade de elemesntos nulo ddo clima:
DATE_TIME              0
PLANT_ID               0
SOURCE_KEY             0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
dtype: int64
In [5]:
# Convert the DATE_TIME columns from text to datetime64.
# The two files use different layouts, so each gets its own explicit format.
# The generation format is day-first without seconds (as given by the original
# format string; confirm against the raw file).
geracao['DATE_TIME'] = pd.to_datetime(geracao['DATE_TIME'], format='%d-%m-%Y %H:%M')
# The weather timestamps include seconds (e.g. '2020-05-15 00:00:00' in the raw
# preview), so the format must contain ':%S'. Without it, strict format matching
# (pandas >= 2.0) fails with "unconverted data remains".
clima['DATE_TIME'] = pd.to_datetime(clima['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')
In [6]:
# Create separate calendar-date and time-of-day columns from DATE_TIME.
# The vectorised `.dt` accessor yields the same datetime.date / datetime.time
# objects as a row-wise `apply(lambda x: x.date())`, without a Python-level call
# per row.
geracao['DATE'] = geracao['DATE_TIME'].dt.date
geracao['TIME'] = geracao['DATE_TIME'].dt.time
clima['DATE'] = clima['DATE_TIME'].dt.date
clima['TIME'] = clima['DATE_TIME'].dt.time
In [7]:
# Check the last rows of both datasets after the date/time columns were added.
# Both frames are displayed explicitly because a cell only renders its last bare
# expression; the generation preview would otherwise be discarded.
display(geracao.tail())
display(clima.tail())
Out[7]:
DATE_TIME PLANT_ID SOURCE_KEY AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION DATE TIME
3177 2020-06-17 22:45:00 4135001 HmiyD2TTLFNqkNe 22.150570 21.480377 0.0 2020-06-17 22:45:00
3178 2020-06-17 23:00:00 4135001 HmiyD2TTLFNqkNe 22.129816 21.389024 0.0 2020-06-17 23:00:00
3179 2020-06-17 23:15:00 4135001 HmiyD2TTLFNqkNe 22.008275 20.709211 0.0 2020-06-17 23:15:00
3180 2020-06-17 23:30:00 4135001 HmiyD2TTLFNqkNe 21.969495 20.734963 0.0 2020-06-17 23:30:00
3181 2020-06-17 23:45:00 4135001 HmiyD2TTLFNqkNe 21.909288 20.427972 0.0 2020-06-17 23:45:00
In [15]:
# Generation of the PV modules: mean DAILY_YIELD per time of day, with one
# column per inverter (SOURCE_KEY) after unstacking.
# groupby does not modify its input, so no defensive copy of `geracao` is needed.
geracao_dia = geracao.groupby(['TIME','SOURCE_KEY'])['DAILY_YIELD'].mean().unstack()

# DataFrame.plot() always opens its own figure, so a preceding plt.figure(figsize=...)
# only leaves an empty extra figure behind; the size is passed to plot() instead.
# Only the first inverter column is drawn.
ax = geracao_dia.iloc[:, 0:1].plot(figsize=(10, 5))
# The plotted quantity is DAILY_YIELD (kWh), not DC power, so the title says so.
ax.set_title('Geração diária média em um inversor da Usina 1')
ax.set_ylabel('kWh')
ax.set_xlabel('Tempo');
Out[15]:
Text(0.5, 0, 'Tempo')
<Figure size 720x360 with 0 Axes>
2021-05-31T17:17:53.167773 image/svg+xml Matplotlib v3.3.4, https://matplotlib.org/
In [17]:
# Mean of every numeric column per inverter.
# numeric_only=True is required: `geracao` also holds object columns (DATE, TIME)
# that cannot be averaged, and pandas >= 2.0 raises a TypeError instead of
# silently dropping them.
conv_Inv = geracao.groupby(['SOURCE_KEY']).mean(numeric_only=True)

# Conversion efficiency in percent.
# NOTE(review): the factor 1000 (rather than 100) presumably compensates for
# DC_POWER being recorded on a scale ~10x larger than AC_POWER in this dataset
# (compare the aggregated DC/AC values further down) -- confirm the units.
eficiencia = conv_Inv['AC_POWER'] * 1000 / conv_Inv['DC_POWER']

ax = eficiencia.plot(figsize=(15, 5), style='o--')
# Horizontal reference line at the mean efficiency across inverters.
ax.axhline(eficiencia.mean(), linestyle='--', color='green')
ax.set_title('Eficiência dos Inversores', size=20)
ax.set_ylabel('Eficiência (%)')
ax.set_xlabel('ID dos inversores');
Out[17]:
Text(0, 0.5, 'Eficiência %')
2021-05-31T17:20:17.143370 image/svg+xml Matplotlib v3.3.4, https://matplotlib.org/
In [25]:
# DC power generated by the modules: total DC_POWER per time of day, with one
# column per calendar DATE after unstacking.
# groupby does not modify its input, so no defensive copy of `geracao` is needed.
geracao_cc = geracao.groupby(['TIME','DATE'])['DC_POWER'].sum().unstack()

fig, ax = plt.subplots(ncols=2, nrows=1, dpi=200, figsize=(20, 5))

# Draw first, label afterwards: DataFrame.plot() overwrites the axis x-label with
# the index name ('TIME'), so labels set before plotting are lost.
geracao_cc.iloc[:, 0:1].plot(ax=ax[0], linewidth=5)
geracao_cc.iloc[:, 1:2].plot(ax=ax[1], linewidth=5, color='orange')

# NOTE(review): the columns of `geracao_cc` are dates and the values are summed
# over all rows sharing a TIME/DATE, so these panels show the first two dates,
# not two individual inverters as the titles suggest -- confirm the intent.
ax[0].set_title('Potência DC em um inversor A da Usina 1')
ax[0].set_ylabel('kW')
ax[0].set_xlabel('Tempo')
ax[1].set_title('Potência DC em um inversor B da Usina 1')
ax[1].set_ylabel('kW')
ax[1].set_xlabel('Tempo');
Out[25]:
<AxesSubplot:title={'center':'Potência DC em um inversor B da Usina 1'}, xlabel='TIME', ylabel='kW'>
2021-05-31T17:29:07.785902 image/svg+xml Matplotlib v3.3.4, https://matplotlib.org/
In [26]:
# AC power delivered by the inverters: total AC_POWER per time of day, with one
# column per calendar DATE after unstacking.
# groupby does not modify its input, so no defensive copy of `geracao` is needed.
geracao_ac = geracao.groupby(['TIME','DATE'])['AC_POWER'].sum().unstack()

fig, ax = plt.subplots(ncols=2, nrows=1, dpi=200, figsize=(20, 5))

# Draw first, label afterwards: DataFrame.plot() overwrites the axis x-label with
# the index name ('TIME'), so labels set before plotting are lost.
geracao_ac.iloc[:, 0:1].plot(ax=ax[0], linewidth=5)
geracao_ac.iloc[:, 1:2].plot(ax=ax[1], linewidth=5, color='orange')

# NOTE(review): the columns of `geracao_ac` are dates and the values are summed
# over all rows sharing a TIME/DATE, so these panels show the first two dates,
# not two individual inverters as the titles suggest -- confirm the intent.
ax[0].set_title('Potência AC em um inversor A da Usina 1')
ax[0].set_ylabel('kW')
ax[0].set_xlabel('Tempo')
ax[1].set_title('Potência AC em um inversor B da Usina 1')
ax[1].set_ylabel('kW')
ax[1].set_xlabel('Tempo');
Out[26]:
<AxesSubplot:title={'center':'Potência AC em um inversor B da Usina 1'}, xlabel='TIME', ylabel='kW'>
2021-05-31T17:29:12.737705 image/svg+xml Matplotlib v3.3.4, https://matplotlib.org/
In [28]:
# Aggregate the readings per timestamp (DATE_TIME), i.e. one row per measurement
# instant, summed over the rows sharing it -- not one row per calendar day.
# numeric_only=True keeps the sum restricted to numeric columns: `geracao` also
# carries object columns (SOURCE_KEY, DATE, TIME) that pandas >= 2.0 no longer
# drops silently and that cannot be meaningfully summed.
# The summed PLANT_ID is an artefact of the aggregation and carries no meaning.
geracao_diaria = geracao.groupby(['DATE_TIME'], as_index=False).sum(numeric_only=True)
geracao_diaria.head()
Out[28]:
DATE_TIME PLANT_ID DC_POWER AC_POWER DAILY_YIELD TOTAL_YIELD
0 2020-05-15 00:00:00 86835021 0.0 0.0 0.0 143581676.0
1 2020-05-15 00:15:00 86835021 0.0 0.0 0.0 143581676.0
2 2020-05-15 00:30:00 86835021 0.0 0.0 0.0 143581676.0
3 2020-05-15 00:45:00 86835021 0.0 0.0 0.0 143581676.0
4 2020-05-15 01:00:00 90970022 0.0 0.0 0.0 150761642.0
In [30]:
# Keep only the variables under study, then inspect a few mid-day rows.
geracao_select = geracao_diaria.loc[
    :, ['DATE_TIME', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD']
]
geracao_select.iloc[45:50]
Out[30]:
DATE_TIME DC_POWER AC_POWER DAILY_YIELD
45 2020-05-15 11:15:00 160301.226190 15683.713690 47752.761904
46 2020-05-15 11:30:00 155409.160714 15207.283929 51726.053571
47 2020-05-15 11:45:00 167668.196427 16401.589286 55271.107142
48 2020-05-15 12:00:00 155821.696428 15250.808333 59620.946429
49 2020-05-15 12:15:00 209569.398819 20477.017856 63932.303572
In [39]:
# Remove the plant and sensor identifiers, which carry no information for the prediction.
clima_drop = clima.drop(columns=['PLANT_ID', 'SOURCE_KEY'])
clima_drop.head()
Out[39]:
DATE_TIME AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION DATE TIME
0 2020-05-15 00:00:00 25.184316 22.857507 0.0 2020-05-15 00:00:00
1 2020-05-15 00:15:00 25.084589 22.761668 0.0 2020-05-15 00:15:00
2 2020-05-15 00:30:00 24.935753 22.592306 0.0 2020-05-15 00:30:00
3 2020-05-15 00:45:00 24.846130 22.360852 0.0 2020-05-15 00:45:00
4 2020-05-15 01:00:00 24.621525 22.165423 0.0 2020-05-15 01:00:00
In [40]:
# Join generation and weather readings on their shared timestamp (inner join),
# then derive a variant without the helper DATE/TIME columns.
usine = geracao_select.merge(clima_drop, on='DATE_TIME', how='inner')
usine_no_time = usine.drop(columns=['DATE', 'TIME'])
usine_no_time.head()
Out[40]:
DATE_TIME DC_POWER AC_POWER DAILY_YIELD AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION
0 2020-05-15 00:00:00 0.0 0.0 0.0 25.184316 22.857507 0.0
1 2020-05-15 00:15:00 0.0 0.0 0.0 25.084589 22.761668 0.0
2 2020-05-15 00:30:00 0.0 0.0 0.0 24.935753 22.592306 0.0
3 2020-05-15 00:45:00 0.0 0.0 0.0 24.846130 22.360852 0.0
4 2020-05-15 01:00:00 0.0 0.0 0.0 24.621525 22.165423 0.0
In [41]:
# Pairwise scatter plots to get a first look at how the variables relate.
sns.pairplot(
    usine.loc[
        :,
        [
            'DC_POWER',
            'AC_POWER',
            'DAILY_YIELD',
            'AMBIENT_TEMPERATURE',
            'MODULE_TEMPERATURE',
            'IRRADIATION',
        ],
    ]
)
Out[41]:
<seaborn.axisgrid.PairGrid at 0x247a35bbd30>
2021-05-31T17:40:02.676685 image/svg+xml Matplotlib v3.3.4, https://matplotlib.org/
In [43]:
# Average time-of-day profile of the weather variables and of the DC generation.
usine_clima = usine.copy()
# numeric_only=True is required: `usine` still contains the object column DATE,
# which cannot be averaged; pandas >= 2.0 raises a TypeError instead of silently
# dropping it.
clima_cc = usine_clima.groupby(['TIME']).mean(numeric_only=True)

fig, ax = plt.subplots(ncols=2, nrows=2, dpi=200, figsize=(15, 5))
clima_cc['IRRADIATION'].plot(ax=ax[0, 0])
clima_cc['AMBIENT_TEMPERATURE'].plot(ax=ax[0, 1])
clima_cc['MODULE_TEMPERATURE'].plot(ax=ax[1, 0])
clima_cc['DC_POWER'].plot(ax=ax[1, 1])

# Label each panel with the variable it shows.
ax[0, 0].set_ylabel('IRRADIATION')
ax[0, 1].set_ylabel('AMBIENT TEMPERATURE')
ax[1, 0].set_ylabel('MODULE TEMPERATURE')
ax[1, 1].set_ylabel('DC POWER');
Out[43]:
Text(0, 0.5, 'DC POWER')
2021-05-31T17:41:01.865124 image/svg+xml Matplotlib v3.3.4, https://matplotlib.org/
In [44]:
# Correlation between the plant variables, used to pick the best predictors of DC generation.
# Shorten the temperature column names by name rather than by position, so the
# rename stays correct if the column order changes and is safe to re-run.
usine_no_time = usine_no_time.rename(
    columns={'AMBIENT_TEMPERATURE': 'AMBIENT', 'MODULE_TEMPERATURE': 'MODULE'}
)
one_correlation = usine_no_time[['DC_POWER','AC_POWER','DAILY_YIELD','AMBIENT','MODULE','IRRADIATION']]
corr = one_correlation.corr()

# NOTE(review): `fig_dims` is not used anywhere in the visible cells; kept only
# in case a later cell reads it.
fig_dims = (2, 2)

# Hide the redundant upper triangle with a true boolean mask. Building the mask
# from the correlation values themselves (np.triu(corr, 1)) relies on a
# float-to-bool cast, which would leave any exactly-zero coefficient unmasked.
mascara_superior = np.triu(np.ones(corr.shape, dtype=bool), k=1)
sns.heatmap(corr.round(2), annot=True, mask=mascara_superior)
plt.savefig('correla.png', format='png')
2021-05-31T17:42:13.286037 image/svg+xml Matplotlib v3.3.4, https://matplotlib.org/
In [45]:
# After choosing the variables with the highest correlation, build the final datasets.
base = usine[['DC_POWER','MODULE_TEMPERATURE','IRRADIATION']]
# `usine_one` is not defined anywhere in the notebook (NameError on a fresh
# kernel); the merged frame `usine` holds both requested columns.
resultados = usine[['DC_POWER','DATE_TIME']]
# Kept as the last expression so the summary is actually rendered by the cell.
base.describe()
In [ ]: